#--------------------------------------------------------------------------------------
#
# penalty_opt.R - code to optimize the penalty function
#
# March 2015
# Richard Judson
#
# US EPA
# Questions, comments to: judson.richard@epa.gov, 919-541-3085
#
#--------------------------------------------------------------------------------------
library(grDevices)
library(RColorBrewer)
library(stringr)
library(pca3d)
library(openxlsx)

source("utils.R")
#===============================================================================
# Pathway-specific - start
#===============================================================================
PATHWAY <<- "ER"
NASSAY <<- 18
NRECEPTOR <<- 9
NRECEPTOR0 <<- 9
NRECEPTOR <<- 26

HEATMAP.CMETHOD <<- "ward.D"
NCONC <- 45
CONCLIST <<- c(1e-6,2.679636e-06,4.019455e-06,6.029182e-06,9.043773e-06,1.356566e-05,2.034849e-05,3.052273e-05,4.57841e-05,6.867615e-05,0.0001030142,0.0001545213,0.000231782,0.000347673,0.0005215095,0.0007822643,0.001173396,0.001760095,0.002640142,0.003960213,0.005940319,0.008910479,0.01336572,0.02004858,0.03007287,0.0451093,0.06766395,0.1014959,0.1522439,0.2283658,0.3425487,0.5138231,0.7707347,1.156102,1.734153,2.601229,3.901844,5.852766,8.77915,13.16872,19.75309,29.62963,44.44444,66.66667,100)
AUCSCALE2 <- 1
#--------------------------------------------------------------------------------------
#
# prepare the data to be optimized
#
#--------------------------------------------------------------------------------------
data.prep <- function() {
	Tmat.va()
	filename <- "../tuning/tuning_candidates_Z.xlsx"
	indata <- read.xlsx(filename)
	mask <- indata[,"Clean_Receptor_Z"]
	indata <- indata[!is.na(mask),]
	nchem <- dim(indata)[1]
	target.mat <- as.data.frame(matrix(nrow=nchem,ncol=NRECEPTOR))
	target.mat[] <- 0
	
	for(i in 1:NRECEPTOR) names(target.mat)[i] <- paste("R",i,sep="")
	rownames(target.mat) <- indata[,"CODE"]
	for(i in 1:nchem) {
		code <- indata[i,"CODE"]
		cname <- indata[i,"Name"]
		receptor <- indata[i,"Clean_Receptor_Z"]
		rid <- as.numeric(substr(receptor,2,nchar(receptor)))
		mask <- TMAT[,rid]
    	filename <- paste("../input/CRall/CRMAT_",code,".txt",sep="")
    	adata <- read.table(filename,header=T,sep="\t")
    	adata <- adata[,mask==1]
    	auc <- sum(adata)/sum(mask)/NCONC
    	if(i==1) auc.max <- auc
		cat(code,":",cname,":",receptor,":",rid,":",auc,"\n")
		flush.console()
		target.mat[code,rid] <- auc
	}
	filename <- "../tuning/target_mat_Z.xlsx"
	write.xlsx(target.mat,file=filename, row.names=T)
}
#--------------------------------------------------------------------------------------
#
# do the parameter scan
#
#--------------------------------------------------------------------------------------
parm.scan <- function(penalty.method="LASSO") {
	PENALTY.METHOD <<- penalty.method

	filename <- "../tuning/target_mat_Z.xlsx"
	target.mat <- read.xlsx(filename,rowNames=T)
	if(penalty.method=="RIDGE") alpha.list <- c(0.0001,0.0002,0.0005,0.001,0.002,0.003,0.004,0.005,0.006,0.007,0.008,0.009,0.01,0.02,0.03,0.04,0.05,0.1,0.2,0.5,1)
	if(penalty.method=="LASSO") alpha.list <- c(0.0001,0.0002,0.0005,0.001,0.002,0.003,0.004,0.005,0.006,0.007,0.008,0.009,0.01,0.02,0.03,0.04,0.05,0.1)
	if(penalty.method=="THRESHOLD") alpha.list <- c(0.001,0.003,0.005,0.007,0.009,0.01,0.03,0.05,0.07,0.09,0.1,0.3,0.5,1,3,5,10,1,3,5,10,30,50,100)
	code.list <- rownames(target.mat)
	nalpha <- length(alpha.list)
	nchem <- length(code.list)
	
	nchem <- 1
	ntest <- nalpha*nchem
	auc.mat <- as.data.frame(matrix(nrow=ntest,ncol=4+NRECEPTOR))
	name.list <- c("CODE","PENALTY.METHOD","ALPHA","AUCSCALE")
	for(i in 1:NRECEPTOR) {
		receptor <- paste("R",i,sep="")
		name.list <- c(name.list,receptor)
	}
	names(auc.mat) <- name.list
	counter <- 1
	AUCSCALE <<- 1
	for(i in 1:nalpha) {
		ALPHA <<- alpha.list[i]
		for(j in 1:nchem) {
			code <- code.list[j]
			cat(code," : ",ALPHA,"\n"); flush.console()
			ret <- run.model(code)
			auc.list <- vector(mode="numeric",length=NRECEPTOR)
			auc.list[] <- 0
			for(k in 1:NRECEPTOR) auc.list[k] <- receptor.score(ret[,k],T)
			auc.list <- colSums(ret)/NCONC
			auc.mat[counter,"CODE"] <- code
			auc.mat[counter,"PENALTY.METHOD"] <- PENALTY.METHOD
			auc.mat[counter,"AUCSCALE"] <- AUCSCALE
			auc.mat[counter,"ALPHA"] <- ALPHA
			for(k in 1:NRECEPTOR) auc.mat[counter,k+4] <- auc.list[k]
			counter <- counter+1
		}
	}
	filename <- paste("../tuning/auc_scan_",PENALTY.METHOD,"_Z.xlsx",sep="")
	write.xlsx(auc.mat,file=filename, row.names=F)		
}
#--------------------------------------------------------------------------------------
#
# do summary plots for teh errors
#
#--------------------------------------------------------------------------------------
error.plot <- function(penalty.method="LASSO",to.file=F) {
	filename <- paste("../tuning/auc_scan_",penalty.method,"_Z.xlsx",sep="")
	auc.mat <- read.xlsx(filename)
	alpha.list <- unique(auc.mat[,"ALPHA"])
	filename <- "../tuning/tuning_candidates_Z.xlsx"
	chems <- read.xlsx(filename)
	rownames(chems) <- chems[,"CODE"]
	
	filename <- "../tuning/target_mat_Z.xlsx"
	target.mat <- read.xlsx(filename,rowNames=T)

	xmin <- min(alpha.list)#1e-4
	xmax <- max(alpha.list) # 0.1
	ymin <- 1e-7
	ymax <- 0.1
	code.list <- sort(unique(auc.mat[,"CODE"]))
	nchem <- length(code.list)
	
    if(to.file) {
    	filename <- paste("../tuning/error_plot_",penalty.method,"_Z.pdf",sep="")
        pdf(file=filename,width=7,height=10,pointsize=12,bg="white",paper="letter",pagecentre=T)
    }
    par(mfrow=c(2,1),mar=c(6,6,1,3))
	plot(1~1,type="n",xlab="alpha",ylab="RMS Error",xlim=c(xmin,xmax),ylim=c(ymin,ymax),log="xy",cex.lab=1,cex.axis=0.6,main=paste("Agonist / Antagonist",penalty.method),xaxp=c(xmin,xmax,2),yaxp=c(ymin,ymax,1))
	yval <- ymin
	for(i in 1:7) {
		lines(c(xmin,xmax),c(yval,yval),col="gray")
		yval <- yval*10
	}
	xval <- xmin
	for(i in 1:7) {
		lines(c(xval,xval),c(ymin,ymax),col="gray")
		xval <- xval*10
	}
	xval <- xmin*5
	for(i in 1:7) {
		lines(c(xval,xval),c(ymin,ymax),col="gray")
		xval <- xval*10
	}

	for(i in 1:nchem) {
		code <- code.list[i]
		cname <- chems[code,"Name"]
		receptor <- chems[code,"Clean_Receptor"]
		ir <- as.numeric(substr(receptor,2,nchar(receptor)))
		temp <- auc.mat[is.element(auc.mat[,"CODE"],code),]
		nconc <- dim(temp)[1]
		err.list <- vector(mode="numeric",length=nconc)
		alpha.list <- vector(mode="numeric",length=nconc)
		for(j in 1:nconc) {
			alpha.list[j] <- temp[j,"ALPHA"]
			if(alpha.list[j]==0) alpha.list[j] <- xmin
			vec.pred <- temp[j,5:(NRECEPTOR+4)]
			vec.true <- target.mat[code,]
			err.list[j] <- sum((vec.true-vec.pred)**2)/NRECEPTOR
		}
		cols <- c("blue","red", "black", "green", "gray","hotpink","cyan","yellow4","orange")
		doit <- T
		if(ir>=3) doit <- F 
#		if(max(err.list)/min(err.list)<10) doit <- F
		if(doit) lines(err.list~alpha.list,col=cols[ir],lwd=2)
	}


	plot(1~1,type="n",xlab="alpha",ylab="RMS Error",xlim=c(xmin,xmax),ylim=c(ymin,ymax),log="xy",cex.lab=1,cex.axis=0.6,main=paste("Pseudo-Receptors",penalty.method),xaxp=c(xmin,xmax,2),yaxp=c(ymin,ymax,1))
	yval <- ymin
	for(i in 1:7) {
		lines(c(xmin,xmax),c(yval,yval),col="gray")
		yval <- yval*10
	}
	xval <- xmin
	for(i in 1:7) {
		lines(c(xval,xval),c(ymin,ymax),col="gray")
		xval <- xval*10
	}
	xval <- xmin*5
	for(i in 1:7) {
		lines(c(xval,xval),c(ymin,ymax),col="gray")
		xval <- xval*10
	}
	
	for(i in 1:nchem) {
		code <- code.list[i]
		cname <- chems[code,"Name"]
		receptor <- chems[code,"Clean_Receptor"]
		ir <- as.numeric(substr(receptor,2,nchar(receptor)))
		temp <- auc.mat[is.element(auc.mat[,"CODE"],code),]
		nconc <- dim(temp)[1]
		err.list <- vector(mode="numeric",length=nconc)
		alpha.list <- vector(mode="numeric",length=nconc)
		for(j in 1:nconc) {
			alpha.list[j] <- temp[j,"ALPHA"]
			if(alpha.list[j]==0) alpha.list[j] <- xmin
			vec.pred <- temp[j,5:(NRECEPTOR+4)]
			vec.true <- target.mat[code,]
			err.list[j] <- sum((vec.true-vec.pred)**2)/NRECEPTOR
		}
		cols <- c("blue","red", "black", "green", "gray","hotpink","cyan","yellow4","orange")
		doit <- T
		if(ir<3) doit <- F 
#		if(max(err.list)/min(err.list)<10) doit <- F
		if(doit) lines(err.list~alpha.list,col=cols[ir],lwd=2)
	}
	if(to.file) dev.off()
}
#--------------------------------------------------------------------------------------
#
# creat heatmaps as a function of the alpha
#
#--------------------------------------------------------------------------------------
error.hm <- function(penalty.method="LASSO",to.file=F) {
	filename <- paste("../tuning/auc_scan_",penalty.method,"_Z.xlsx",sep="")
	auc.mat <- read.xlsx(filename)

	filename <- "../tuning/tuning_candidates_Z.xlsx"
	chems <- read.xlsx(filename)
	rownames(chems) <- chems[,"CODE"]
	
	filename <- "../tuning/target_mat_Z.xlsx"
	target.mat <- read.xlsx(filename,rowNames=T)

	xmin <- 1e-4
	xmax <- 0.1
	ymin <- 1e-7
	ymax <- 0.1
	code.list <- sort(unique(auc.mat[,"CODE"]))
	nchem <- length(code.list)
	rec.list <- c("Agonist","Antagonist")
	for(i in 3:9) rec.list <- c(rec.list,paste("R",i,sep=""))
	for(i in c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,18)) rec.list <- c(rec.list,paste("A",i,sep=""))
	alpha.list <- sort(unique(auc.mat[,"ALPHA"]))
    if(to.file) {
    	filename <- paste("../tuning/error_hm_",penalty.method,"_Z.pdf",sep="")
        pdf(file=filename,width=7,height=10,pointsize=12,bg="white",paper="letter",pagecentre=T)
    }
   # par(mfrow=c(1,1),mar=c(6,6,1,3))
    for(i in 1:nchem) {
		code <- code.list[i]
		cname <- chems[code,"Name"]
		receptor <- chems[code,"Clean_Receptor"]
		main <- paste(code,":",cname,":",receptor,"\n",penalty.method)
		temp <- as.matrix(t(auc.mat[is.element(auc.mat[,"CODE"],code),5:(4+NRECEPTOR)]))
		heatmap(temp,margins=c(12,12),scale="none",labRow=rec.list,labCol=alpha.list,Rowv=NA,Colv=NA,
		        xlab="alpha",ylab="Receptor",cexCol=1,cexRow=1,col=brewer.pal(9,"Reds"),
		        hclustfun=function(x) hclust(d=dist(x),method="ward.D"),
		        main=main,cex.main=0.6)

		if(!to.file) browser()
	}

	if(to.file) dev.off()
}
#--------------------------------------------------------------------------------------
#
# Compare values for different alpha and penalty terms
#
#--------------------------------------------------------------------------------------
comp.plot <- function(to.file=F,do.all=F,receptor="R1") {
	penalty.method <- "THRESHOLD"
	alpha <- 1
	filename <- paste("../output/refchem_AUC_",penalty.method,"_",alpha,".txt",sep="")
    ref.data <- read.table(filename,header=T,sep="\t",stringsAsFactors=F,quote="\"",comment="")
    if(to.file) {
    	filename <- paste("../tuning/comp_plot_",receptor,"_ALL.pdf",sep="")
    	if(!do.all) filename <- paste("../tuning/comp_plot_",receptor,"_SELECTED.pdf",sep="")
        pdf(file=filename,width=7,height=7,pointsize=12,bg="white",paper="letter",pagecentre=T)
    }

	x <- ref.data[,receptor]
	plot(1~1,xlab=paste("AUC(",receptor,"|THRESHOLD,alpha=1)",sep=""),ylab=paste("AUC(",receptor,"|other conditions)",sep=""),xlim=c(0,1.2),ylim=c(0,1.2),
		 main=paste("Sensitivty of AUC(",receptor,") to Penalty",sep=""),cex.lab=1.0,cex.axis=1.0,type="n")
	lines(c(0,1.5),c(0,1.5))
	alpha <- 0.001
	filename <- paste("../output/refchem_AUC_",penalty.method,"_",alpha,".txt",sep="")
    comp.data <- read.table(filename,header=T,sep="\t",stringsAsFactors=F,quote="\"",comment="")
	y <- comp.data[,receptor]
	if(do.all) points(y~x,pch=25,bg="lightgray",cex=1)

	alpha <- 0.01
	filename <- paste("../output/refchem_AUC_",penalty.method,"_",alpha,".txt",sep="")
    comp.data <- read.table(filename,header=T,sep="\t",stringsAsFactors=F,quote="\"",comment="")
	y <- comp.data[,receptor]
	points(y~x,pch=24,bg="yellow")

	alpha <- 1
	filename <- paste("../output/refchem_AUC_",penalty.method,"_",alpha,".txt",sep="")
    comp.data <- read.table(filename,header=T,sep="\t",stringsAsFactors=F,quote="\"",comment="")
	y <- comp.data[,receptor]
	points(y~x,pch=24,bg="orange")

	alpha <- 10
	filename <- paste("../output/refchem_AUC_",penalty.method,"_",alpha,".txt",sep="")
    comp.data <- read.table(filename,header=T,sep="\t",stringsAsFactors=F,quote="\"",comment="")
	y <- comp.data[,receptor]
	points(y~x,pch=24,bg="khaki")

	alpha <- 100
	filename <- paste("../output/refchem_AUC_",penalty.method,"_",alpha,".txt",sep="")
    comp.data <- read.table(filename,header=T,sep="\t",stringsAsFactors=F,quote="\"",comment="")
	y <- comp.data[,receptor]
	points(y~x,pch=24,bg="red")

	penalty.method <- "LASSO"
	alpha <- 0.05
	filename <- paste("../output/refchem_AUC_",penalty.method,"_",alpha,".txt",sep="")
    comp.data <- read.table(filename,header=T,sep="\t",stringsAsFactors=F,quote="\"",comment="")
	y <- comp.data[,receptor]
	if(do.all) points(y~x,pch=21,bg="blue")

	alpha <- 0.01
	filename <- paste("../output/refchem_AUC_",penalty.method,"_",alpha,".txt",sep="")
    comp.data <- read.table(filename,header=T,sep="\t",stringsAsFactors=F,quote="\"",comment="")
	y <- comp.data[,receptor]
	points(y~x,pch=21,bg="skyblue")

	alpha <- 0.001
	filename <- paste("../output/refchem_AUC_",penalty.method,"_",alpha,".txt",sep="")
    comp.data <- read.table(filename,header=T,sep="\t",stringsAsFactors=F,quote="\"",comment="")
	y <- comp.data[,receptor]
	points(y~x,pch=21,bg="cyan")

	alpha <- 0.0001
	filename <- paste("../output/refchem_AUC_",penalty.method,"_",alpha,".txt",sep="")
    comp.data <- read.table(filename,header=T,sep="\t",stringsAsFactors=F,quote="\"",comment="")
	y <- comp.data[,receptor]
	if(do.all) points(y~x,pch=21,bg="green")

	penalty.method <- "RIDGE"
	alpha <- 0.001
	filename <- paste("../output/refchem_AUC_",penalty.method,"_",alpha,".txt",sep="")
    comp.data <- read.table(filename,header=T,sep="\t",stringsAsFactors=F,quote="\"",comment="")
	y <- comp.data[,receptor]
	if(do.all) points(y~x,pch=3,bg="violet")

	alpha <- 0.01
	filename <- paste("../output/refchem_AUC_",penalty.method,"_",alpha,".txt",sep="")
    comp.data <- read.table(filename,header=T,sep="\t",stringsAsFactors=F,quote="\"",comment="")
	y <- comp.data[,receptor]
	points(y~x,pch=4,bg="violet")

	alpha <- 0.1
	filename <- paste("../output/refchem_AUC_",penalty.method,"_",alpha,".txt",sep="")
    comp.data <- read.table(filename,header=T,sep="\t",stringsAsFactors=F,quote="\"",comment="")
	y <- comp.data[,receptor]
	if(do.all) points(y~x,pch=5,bg="violet")

	alpha <- 1
	filename <- paste("../output/refchem_AUC_",penalty.method,"_",alpha,".txt",sep="")
    comp.data <- read.table(filename,header=T,sep="\t",stringsAsFactors=F,quote="\"",comment="")
	y <- comp.data[,receptor]
	if(do.all) points(y~x,pch=8,bg="violet")

	x <- 0.01
	y <- 1.2
	dy <- 0.05
	if(do.all) {points(x,y,pch=25,bg="lightgray"); text(x,y,"THRESHOLD 0.001",pos=4); y <- y-dy}
	points(x,y,pch=24,bg="yellow"); text(x,y,"THRESHOLD 0.01",pos=4); y <- y-dy
	if(do.all) points(x,y,pch=24,bg="orange"); text(x,y,"THRESHOLD 1",pos=4); y <- y-dy
	if(do.all) points(x,y,pch=24,bg="khaki"); text(x,y,"THRESHOLD 10",pos=4); y <- y-dy
	points(x,y,pch=24,bg="red"); text(x,y,"THRESHOLD 100",pos=4); y <- y-dy
	if(do.all) {points(x,y,pch=21,bg="blue"); text(x,y,"LASSO 0.05",pos=4); y <- y-dy}
	points(x,y,pch=21,bg="skyblue"); text(x,y,"LASSO 0.01",pos=4); y <- y-dy
	points(x,y,pch=21,bg="cyan"); text(x,y,"LASSO 0.001",pos=4); y <- y-dy
	if(do.all) {points(x,y,pch=21,bg="green"); text(x,y,"LASSO 0.0001",pos=4); y <- y-dy}
	if(do.all) {points(x,y,pch=3,bg="violet"); text(x,y,"RIDGE 0.001",pos=4); y <- y-dy}
	points(x,y,pch=4,bg="violet"); text(x,y,"RIDGE 0.01",pos=4); y <- y-dy
	if(do.all) {points(x,y,pch=5,bg="violet"); text(x,y,"RIDGE 0.1",pos=4); y <- y-dy}
	if(do.all) {points(x,y,pch=8,bg="violet"); text(x,y,"RIDGE 1",pos=4); y <- y-dy}
	
	if(to.file) dev.off()
}
#--------------------------------------------------------------------------------------
#
# run the bare model
#
#--------------------------------------------------------------------------------------
run.model <- function(code,do.debug=F) {
   	filename <- paste("../input/CRall/CRMAT_",code,".txt",sep="")
    adata <- read.table(filename,header=T,sep="\t")
    
    
    rmat <- matrix(nrow=NCONC,ncol=NASSAY)
    rmat[] <- 0
    adata <- as.data.frame(t(adata))
    adata <- cbind(adata,TMAT)
    conc.names <- c()
    for(i in 1:NCONC) conc.names <- c(conc.names,paste("C",i,sep=""))
    t.names <- c()
    for(i in 1:NRECEPTOR) t.names <- c(t.names,paste("T",i,sep=""))
    names(adata) <- c(conc.names,t.names)
    
    
    resmat <- as.data.frame(matrix(nrow=NCONC,ncol=NRECEPTOR))
    allrnames <- c()
    for(i in 1:NRECEPTOR) allrnames <- c(allrnames,paste("R",i,sep=""))
    names(resmat) <- allrnames
    resmat[] <- 0
    start <- vector(mode="numeric",length=NRECEPTOR)
    lwr <- vector(mode="numeric",length=NRECEPTOR)
    upr <- vector(mode="numeric",length=NRECEPTOR)
    start[] <- 0
    lwr[] <- 0
    upr[] <- 1
    for(i in 1:NCONC) {
        concname <- paste("C",i,sep="")
        A <- adata[,c(concname,t.names)]
        if(i>1) start <- res$par
        if(do.debug) res <- optim(par=start,f=AFR.va,A=A,method="L-BFGS-B",lower=lwr,upper=upr,control=list(maxit=2000))
        else res <- optim(par=start,f=AFR.va,A=A,method="L-BFGS-B",lower=lwr,upper=upr,control=list(maxit=2000))   
        
        for(j in 1:NRECEPTOR) resmat[i,j] <- res$par[j]
        if(res$convergence!=0 || do.debug) cat(i,"Convergence: ",res$convergence," Calls: ",res$counts," residual: ",res$value," : ",res$message,"\n")
   		if(do.debug) print(res$par)	
   	}
   	return(resmat)
}
#--------------------------------------------------------------------------------------
#
# penalty term
#
#--------------------------------------------------------------------------------------
penalty <- function(x) {
	if(PENALTY.METHOD=="RIDGE") value <- ALPHA * sum(x*x) # ridge regression
	if(PENALTY.METHOD=="LASSO") value <- ALPHA * sum(abs(x)) # LASSO regression
	if(PENALTY.METHOD=="THRESHOLD") {
		sumx <- sum(x)
		a <- sumx**10
		b <- 0.5**10
		value <- ALPHA * a/(a+b)
	}

	return(value)
}
#--------------------------------------------------------------------------------------
#
# receptor score or AUC
#
#--------------------------------------------------------------------------------------
receptor.score <- function(x,do.print=F) {
	nuse <- length(x)
	if(nuse<2) return(0)
	if(do.print) {
		cat("==========================================\n")
		print(x)
	}
	score <- x[1]
	for(i in 2:nuse) {
		slope.sign <- 1
		delta <- x[i]-x[i-1]
		if(delta < -0.01) slope.sign <- -1
		score <- score + slope.sign*x[i]
		if(do.print) cat("   ",i,"delta: ",format(delta,digits=3),"sign: ",slope.sign," x:",format(x[i],digits=3),"\n")
	}
	score <- score/nuse
	if(score<0) score <- 0
	if(do.print) cat("Score: ",format(score,digits=3),"\n")
	score <- score*AUCSCALE2
	return(score)
}
